In [1]:
# Core single-cell stack (scanpy/anndata/scvelo) plus the rpy2 bridge used
# below to run edgeR and annotation packages from R.
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import sys
import yaml
import numbers
import os
import plotly
import matplotlib.pyplot as plt
import scvelo as scv
from tqdm.notebook import tqdm
import itertools
import random
import seaborn as sns

import rpy2
import rpy2.robjects as ro

# NOTE(review): blanket suppression hides genuine problems (deprecations,
# dtype warnings). Consider filtering specific categories instead.
warnings.filterwarnings('ignore')
In [2]:
%load_ext rpy2.ipython
In [3]:
%%R
# Load edgeR (pseudobulk differential expression) and the human gene-ID
# annotation stack used by downstream DE cells.
library(edgeR)
library(org.Hs.eg.db)
library(AnnotationDbi)
library(stats)
R[write to console]: Loading required package: limma

R[write to console]: Loading required package: AnnotationDbi

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following object is masked from ‘package:limma’:

    plotMA


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


R[write to console]: Loading required package: Biobase

R[write to console]: Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


R[write to console]: Loading required package: IRanges

R[write to console]: Loading required package: S4Vectors

R[write to console]: 
Attaching package: ‘S4Vectors’


R[write to console]: The following object is masked from ‘package:base’:

    expand.grid


R[write to console]: 

In [4]:
%matplotlib inline
In [5]:
# Scanpy global settings; print_header() pins package versions for provenance.
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
# NOTE(review): a near-identical settings cell appears again below and
# overrides this figsize to (9, 9) — consider keeping a single config cell.
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
In [ ]:

In [6]:
# Enable offline plotly rendering inside the notebook.
plotly.offline.init_notebook_mode()
In [7]:
%matplotlib inline
In [8]:
# Duplicate of the earlier settings cell; the only effective difference is
# that the figure size is shrunk from (10, 10) to (9, 9).
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (9, 9)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5

Configure paths¶

In [9]:
# Host-specific path resolution: the hostname prefix selects entries in the
# user-level paths_config.yaml (per-machine data locations).
hostRoot = "-".join(socket.gethostname().split('-')[0:2])

with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
    paths = yaml.load(f, Loader=yaml.FullLoader)

#indir=paths["paths"]["indir"][hostRoot]
FinaLeaf="/Neurons"  # sub-directory leaf for this (Neurons) analysis branch
outdir="./outdir/"
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
# Category -> color map used below to populate AnnData uns "*_colors".
with open("./colorMap.yaml", 'r') as f:
    colorMap = yaml.load(f, Loader=yaml.FullLoader)["uns_colors"]
colorMap
Out[9]:
{'medial': {'color': '#CD5C5C'},
 'distal': {'color': '#FFCBCB'},
 'proximal': {'color': '#8D021F'},
 'piece1': {'color': '#281E5D'},
 'piece2': {'color': '#3779FF'},
 'piece3': {'color': '#BFD4FF'},
 'control': {'color': '#0056D1'},
 'polaroid': {'color': '#DE001E'},
 'enriched': {'color': '#DE001E'},
 'not_enriched': {'color': '#0056D1'},
 'pfc': {'color': '#DE001E'},
 'somatosensory': {'color': '#E5E4E2'},
 'temporal': {'color': '#0056D1'},
 'motor': {'color': '#37F7C8'},
 'v1': {'color': '#28F30C'},
 'parietal': {'color': '#D41FFC'}}

Cells selection and pBulking¶

In [10]:
# Re-attach raw counts: take the cells kept in the DA-annotated object,
# pull their raw counts from the full dataset, and carry the obs over.
adata = sc.read_h5ad(outdir+FinaLeaf+"/4A_Neurons_DA.h5ad")
adataraw = sc.read_h5ad(outdir+"/1_polaroid_mint.h5ad")[adata.obs_names]
# NOTE(review): obs is assigned by reference (no .copy()); harmless here
# only because `adata` is rebound to `adataraw` on the next line.
adataraw.obs = adata.obs
adata = adataraw
In [11]:
# Per-cell library-size normalization (default target_sum, i.e. the median
# of total counts per the scanpy default — confirm if a fixed target is wanted).
sc.pp.normalize_total(adata)
normalizing counts per cell
    finished (0:00:00)
In [12]:
# Inspect per-cell metadata after normalization (rich display).
adata.obs
Out[12]:
dataset organoid region type type_region regionContrast n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts ... leiden0.3 S_score G2M_score phase subLeiden umap_density_type umap_density_region umap_density_organoid DiffAbundant leiden_partition_final
AAACGAACATTCAGCA-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 3191 8.068403 7746.0 8.955061 ... 2 -0.128590 -0.127933 G1 1 0.773493 0.685443 0.741122 0 not_enriched
AACAAGATCGAACCTA-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 2411 7.788212 5934.0 8.688622 ... 2 -0.174303 -0.211261 G1 1 0.243025 0.190615 0.295339 0 not_enriched
AACCAACAGGACTTCT-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 942 6.849066 1343.0 7.203405 ... 2 -0.158302 -0.146888 G1 1 0.585016 0.606536 0.665529 0 not_enriched
AACCACAAGGATGTTA-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 1700 7.438972 3382.0 8.126518 ... 2 -0.017421 -0.150694 G1 0 0.280893 0.337840 0.210278 0 not_enriched
AACCTGAAGTCAGGGT-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 2189 7.691657 4256.0 8.356320 ... 2 -0.099218 -0.105115 G1 0 0.657916 0.486592 0.829882 0 not_enriched
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTGCATTAGGGCAATC-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 4158 8.333030 12796.0 9.456966 ... 8 -0.159563 -0.157904 G1 4 0.853591 0.693369 0.787649 1 enriched
TTGTGTTAGATCGACG-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 2715 7.906915 5878.0 8.679142 ... 8 -0.108142 -0.146919 G1 2 0.887941 0.689022 0.919148 1 enriched
TTGTTCACATCAGTCA-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 1589 7.371489 4025.0 8.300529 ... 8 -0.085110 0.002994 G2M 4 0.453936 0.387748 0.400370 1 enriched
TTTGATCGTATGATCC-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 4502 8.412499 16372.0 9.703389 ... 2 -0.175276 -0.111532 G1 1 0.066308 0.021418 0.063423 0 not_enriched
TTTGGAGTCGTCAAAC-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 3663 8.206311 9542.0 9.163563 ... 8 -0.123037 -0.117675 G1 4 0.787412 0.870449 0.255348 1 enriched

3650 rows × 29 columns

In [13]:
# Checkpoint: normalized cells prior to pseudobulk aggregation.
adata.write_h5ad(outdir+FinaLeaf+"/5A_Neurons_preBulk.h5ad")

by region¶

In [14]:
# Sanity check of pseudobulk granularity: with the chosen number of
# pseudo-replicates (PRs) per group, show how many cells would be averaged
# into each PR for every level of the grouping covariate.
groupingCovariate = "region"
PseudooReplicates_per_group = 10
print(f"Pbulking with target of {PseudooReplicates_per_group} PRs will result in following cells per PR")
adata.obs.groupby(groupingCovariate).size() / PseudooReplicates_per_group
Pbulking with target of 10 PRs will result in following cells per PR
Out[14]:
region
distal       48.8
medial       27.4
piece1       58.6
piece2       54.5
piece3       64.4
proximal    111.3
dtype: float64
In [15]:
# Pseudobulk aggregation: within each level of `groupingCovariate`, shuffle
# the cells with a fixed seed and deal them round-robin into
# `PseudooReplicates_per_group` pseudo-replicates (PRs), then sum counts per
# PR to obtain bulk-like expression profiles for edgeR-style DE testing.
total = pd.DataFrame(index=adata.var_names)

pr_labels = [str(r) for r in range(PseudooReplicates_per_group)]
total_metadata = pd.DataFrame(
    index=["_".join(rep) for rep in itertools.product(
        adata.obs[groupingCovariate].cat.categories.tolist(), pr_labels)])

for group in adata.obs[groupingCovariate].cat.categories:
    groupAdata = adata[adata.obs[groupingCovariate] == group]

    # Fixed-seed shuffle so the PR assignment is reproducible across runs.
    group_cells = groupAdata.obs_names.tolist()
    random.Random(4).shuffle(group_cells)

    # Round-robin split: shuffled cell i goes to PR (i % n_PRs).
    metaCellslist = [group_cells[i::PseudooReplicates_per_group]
                     for i in range(PseudooReplicates_per_group)]

    for prIdx, prCells in enumerate(metaCellslist):
        prName = '{}_{}'.format(group, prIdx)
        # Sum counts over the PR's cells (sparse sum -> np.matrix -> 1D via .A1).
        total[prName] = adata[prCells].X.sum(axis=0).A1
        total_metadata.loc[prName, "group"] = group
        total_metadata.loc[prName, "pseudoreplicate"] = prIdx
        total_metadata.loc[prName, "number_of_cell"] = int(len(prCells))

    # Propagate any obs column that is constant within the group to its
    # aggregated PRs (the grouping covariate itself is excluded).
    constant_cols = [obsMD for obsMD in groupAdata.obs.columns.tolist()
                     if len(groupAdata.obs[obsMD].unique()) == 1
                     and obsMD != groupingCovariate]
    group_pr_index = ["_".join(l) for l in itertools.product([group], pr_labels)]
    for obsMD in constant_cols:
        # .iloc[0] replaces a deprecated positional []-lookup on a
        # label-indexed Series; the column is constant within the group,
        # so the first row's value is representative.
        total_metadata.loc[group_pr_index, obsMD] = groupAdata.obs[obsMD].iloc[0]

# Keep only metadata columns that were filled for every PR.
total_metadata = total_metadata.dropna(axis=1)

# Build the pseudobulk AnnData: PRs as obs, genes as var.
adata_bulk = sc.AnnData(total.transpose())
adata_bulk.var = adata.var.copy()
adata_bulk.obs = pd.concat([adata_bulk.obs, total_metadata], axis=1)

adata_bulk.obs["group"] = adata_bulk.obs["group"].astype("category")
with open("./colorMap.yaml", 'r') as f:
    colorMap = yaml.load(f, Loader=yaml.FullLoader)["uns_colors"]
adata_bulk.uns["group_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["group"].cat.categories]

adata_bulk.obs["regionContrast"] = adata_bulk.obs["regionContrast"].astype("category")
adata_bulk.uns["regionContrast_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["regionContrast"].cat.categories]

adata_bulk.write_h5ad(outdir+FinaLeaf+"/5A_Neurons_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.h5ad")
total.to_csv(outdir+FinaLeaf+"/5A_Neurons_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.tsv", sep="\t")
... storing 'type' as categorical
... storing 'type_region' as categorical
... storing 'is.Stressed' as categorical
In [16]:
# Resulting pseudobulk object: 60 obs (6 regions x 10 PRs) x 33542 genes.
adata_bulk
Out[16]:
AnnData object with n_obs × n_vars = 60 × 33542
    obs: 'group', 'pseudoreplicate', 'number_of_cell', 'type', 'type_region', 'regionContrast', 'is.Stressed'
    var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'group_colors', 'regionContrast_colors'
In [5]:
# Resolve this notebook's own filename for the export cell below.
# NOTE(review): mid-notebook import — better placed in the top import cell.
import ipynbname
nb_fname = ipynbname.name()
In [6]:
%%bash -s "$nb_fname"
# Export the notebook to .py (outputs stripped) and to .html for sharing.
# NOTE(review): the 120 s sleep presumably waits for the autosave to flush
# before converting — confirm; it makes a full re-run needlessly slow.
sleep 120
jupyter nbconvert "$1".ipynb --to="python" --ClearOutputPreprocessor.enabled=True
jupyter nbconvert "$1".ipynb --to="html"
[NbConvertApp] Converting notebook 05A_Neurons_pBulk.ipynb to python
[NbConvertApp] Writing 8073 bytes to 05A_Neurons_pBulk.py
[NbConvertApp] Converting notebook 05A_Neurons_pBulk.ipynb to html
[NbConvertApp] Writing 4300226 bytes to 05A_Neurons_pBulk.html
In [ ]: